A Case Study on Craft Beers and Breweries across the USA

We’ve obtained data on craft beers and breweries found in the United States and offer the following analysis.

## Load packages
# install.packages("readr")
# install.packages("ggplot2")
# install.packages("ggthemes")
# install.packages("plotly")
# install.packages("stringr")
# install.packages("maps")
# install.packages("tidyverse")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("pastecs")
# install.packages("e1071")
# install.packages("randomForest")
# install.packages("caTools")
# install.packages('plotly')

#load libraries
library(readr)
library(ggplot2)
library(ggthemes)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(stringr)
library(maps)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::map()    masks maps::map()
library(tidyr)
library(dplyr)
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(class)
library(caTools)
library(plotly)
library(maps)


#Load data
Beers <- read_csv(file.choose())
## Rows: 2410 Columns: 7
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Breweries <- read_csv(file.choose())
## Rows: 558 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

A Summary of Breweries Among States

How many breweries are present in each state?

Please find the below plot which will interactively show each state’s exact number of craft breweries, followed by a summary of each in plain text and finally, with a heatmap of the United States.

#bar plot of breweries by state
brewstateplot = ggplot(Breweries,mapping = aes(x=State,fill=State)) + geom_bar() + ggtitle("Breweries present in each state")
ggplotly(brewstateplot)
summary(as.factor(Breweries$State)) # display breweries by state in plain text
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS 
##  7  3  2 11 39 47  8  1  2 15  7  4  5  5 18 22  3  4  5 23  7  9 32 12  9  2 
## MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 
##  9 19  1  5  3  3  4  2 16 15  6 29 25  5  4  1  3 28  4 16 10 23 20  1  4
#heatmap of breweries by state
lookup = data.frame(abb = state.abb, State = state.name)
Beer.map=Breweries
colnames(Beer.map)[4]="abb"
Beer.map2=merge(Beer.map,lookup,"abb")
Beer.mapdata=count(Beer.map2,State)
colnames(Beer.mapdata)[2]="B"
Beer.mapdata$region=tolower(Beer.mapdata$State)
Beer.mapdata2=Beer.mapdata[-1]
states <- map_data("state")
map.df <- merge(states,Beer.mapdata2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]
ggplot(map.df, aes(x=long,y=lat,group=group))+
  geom_polygon(aes(fill=B))+
  geom_path()+ 
  scale_fill_gradientn(colours=rev(heat.colors(10)),na.value="grey90")+ggtitle("Heatmap of Breweries by State")+
  coord_map()

Tidying the Raw Data

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.

Beer.afmg = merge(Beers,Breweries, by.x = "Brewery_id", by.y = "Brew_ID") # merge by brewery id
head(Beer.afmg) # show first 6 observations
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces            Name.y        City
## 1                        American IPA     16 NorthGate Brewing Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing Minneapolis
## 5                     American Porter     16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
tail(Beer.afmg) # show last 6 observations
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

Address Missing values

Address the missing values in each column.

Missing values (in ABV and IBU) were omitted due to several possible alternative circumstances. If we were to replace them with the median value, it would hold many statistics to their values (namely the median across all) but ethically would be unsound. We also could replace these values with zeroes when doing our calculations, which is more ethically right, but would skew the statistics.

Beer.afIBUna = Beer.afmg[!is.na(Beer.afmg$IBU),] #Omit IBU NAs
Beer.afABVna = Beer.afmg[!is.na(Beer.afmg$ABV),] #Omit ABV NAs
Beer.afna = Beer.afABVna[!is.na(Beer.afABVna$IBU),] #ALL NA omitted

Median comparison of ABV and IBU among states

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

ABV.summary <- Beer.afABVna %>%  group_by(State) %>%  summarise(median = median(ABV)) #set ABV summary tibble
medianabvplot = ggplot(ABV.summary,aes(State,median,fill = State))+geom_bar(stat = "identity")
ggplotly(medianabvplot) #plot median ABV in plotly
IBU.summary <- Beer.afIBUna %>%  group_by(State) %>%  summarise(median = median(IBU)) #set IBU summary tibble
medianibuplot = ggplot(IBU.summary,aes(State,median,fill = State))+geom_bar(stat = "identity")
ggplotly(medianibuplot) #plot median IBU in plotly

Max ABV and IBU by state

Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

Beer.afABVna[which.max(Beer.afABVna$ABV),]$State # Highest ABV
## [1] "CO"
Beer.afIBUna[which.max(Beer.afIBUna$IBU),]$State # Highest IBU
## [1] "OR"

Distribution of ABV

Comment on the summary statistics and distribution of the ABV variable.

#summary statistics
stat.desc(Beer.afABVna)
##                Brewery_id Name.x      Beer_ID          ABV          IBU Style
## nbr.val      2.348000e+03     NA 2.348000e+03 2.348000e+03 1.405000e+03    NA
## nbr.null     0.000000e+00     NA 0.000000e+00 0.000000e+00 0.000000e+00    NA
## nbr.na       0.000000e+00     NA 0.000000e+00 0.000000e+00 9.430000e+02    NA
## min          1.000000e+00     NA 1.000000e+00 1.000000e-03 4.000000e+00    NA
## max          5.580000e+02     NA 2.692000e+03 1.280000e-01 1.380000e+02    NA
## range        5.570000e+02     NA 2.691000e+03 1.270000e-01 1.340000e+02    NA
## sum          5.429760e+05     NA 3.379606e+06 1.403480e+02 6.001200e+04    NA
## median       2.060000e+02     NA 1.463500e+03 5.600000e-02 3.500000e+01    NA
## mean         2.312504e+02     NA 1.439355e+03 5.977342e-02 4.271317e+01    NA
## SE.mean      3.223359e+00     NA 1.544999e+01 2.794636e-04 6.924162e-01    NA
## CI.mean.0.95 6.320927e+00     NA 3.029705e+01 5.480212e-04 1.358282e+00    NA
## var          2.439582e+04     NA 5.604729e+05 1.833786e-04 6.736135e+02    NA
## std.dev      1.561916e+02     NA 7.486474e+02 1.354173e-02 2.595407e+01    NA
## coef.var     6.754219e-01     NA 5.201269e-01 2.265511e-01 6.076362e-01    NA
##                    Ounces Name.y City State
## nbr.val      2.348000e+03     NA   NA    NA
## nbr.null     0.000000e+00     NA   NA    NA
## nbr.na       0.000000e+00     NA   NA    NA
## min          8.400000e+00     NA   NA    NA
## max          3.200000e+01     NA   NA    NA
## range        2.360000e+01     NA   NA    NA
## sum          3.191330e+04     NA   NA    NA
## median       1.200000e+01     NA   NA    NA
## mean         1.359170e+01     NA   NA    NA
## SE.mean      4.813807e-02     NA   NA    NA
## CI.mean.0.95 9.439756e-02     NA   NA    NA
## var          5.440958e+00     NA   NA    NA
## std.dev      2.332586e+00     NA   NA    NA
## coef.var     1.716185e-01     NA   NA    NA
#plot of ABV distribution
abvdistplot= ggplot(Beer.afABVna,mapping = aes(ABV,fill=State)) + geom_histogram(position = "stack", binwidth=0.003)
ggplotly(abvdistplot)

The ABV distribution is right-skewed which shows that craft brewers are providing a higher alcohol content than the 5-6% range, indicating a higher likelihood of a consumer preference in craft beers with a higher alcohol content.

Relationship between IBU and ABV

Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.

#linear model plot
Beer.model = lm(IBU~ABV,data=Beer.afna) #set linear model
summary(Beer.model)
## 
## Call:
## lm(formula = IBU ~ ABV, data = Beer.afna)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.849 -11.977  -0.721  13.997  93.458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -34.099      2.326  -14.66   <2e-16 ***
## ABV         1282.037     37.860   33.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.26 on 1403 degrees of freedom
## Multiple R-squared:  0.4497, Adjusted R-squared:  0.4493 
## F-statistic:  1147 on 1 and 1403 DF,  p-value: < 2.2e-16
lmplot = ggplot(Beer.afna,mapping = aes(x = ABV,y = IBU)) + geom_point() +
  geom_smooth(method = 'lm', linetype = "dashed", color = "darkred", fill="blue") + 
  stat_density_2d() #2d density estimation
ggplotly(lmplot)
## `geom_smooth()` using formula 'y ~ x'
#best fit model plot
abviburelplot = ggplot(Beer.afna,mapping = aes(x = ABV,y = IBU)) + geom_point() +
  geom_smooth(linetype = "dashed", color = "darkred", fill = "blue") + 
  stat_density_2d() #2d density estimation
ggplotly(abviburelplot)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cor(Beer.afna$ABV,Beer.afna$IBU) #correlation coefficient between ABV and IBU
## [1] 0.6706215

Seen in the above plots, there is in fact a relationship between IBU and ABV, shown with a linear model and best fit model. With a correlation coefficient of 0.67; this shows a strong positive correlation between the two.

##k-NN Classification

Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).
You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.

Beer.ale=Beer.afna[str_detect(Beer.afna$Style,"(IPA|Ale)"),] #create data frame of IPAs and Ales
Beer.ale=na.omit(Beer.ale) #omit any NAs from data frame
Beer.ale$Type=ifelse(str_detect(Beer.ale$Style,"IPA"),"IPA","Ale") #set type to IPA or else Ale
set.seed(4) #set seed for random number generation
smp_size.ale=floor(0.7 * nrow(Beer.ale)) #set sample size for training indices
train.ind=sample(seq_len(nrow(Beer.ale)), size = smp_size.ale) #set training indices for training set and inversely test

Beer.train=Beer.ale[train.ind,] #create training set
Beer.train$IBU = scale(Beer.train$IBU) #scale IBU variable
Beer.train$ABV = scale(Beer.train$ABV) #scale ABV variable

Beer.test = Beer.ale[-train.ind,] #create test set
Beer.test$IBU = scale(Beer.test$IBU) #scale IBU variable
Beer.test$ABV = scale(Beer.test$ABV) #scale ABV variable

abvibuplot = ggplot(Beer.ale, aes(x = IBU, y = ABV,color = Type)) + geom_point() + ggtitle("Distribution of Bitterness and Alcohol Content in Ales and IPAs")
ggplotly(abvibuplot)
#k-NN model with k as square root of number of observations in training set
Beer.knn = knn(Beer.train[,c(4,5)],Beer.test[,c(4,5)],Beer.train$Type,k=sqrt(nrow(Beer.train)))

confusionMatrix(Beer.knn,as.factor(Beer.test$Type)) #confusion matrix to show statistics on model
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Ale IPA
##        Ale 151  16
##        IPA  29  88
##                                          
##                Accuracy : 0.8415         
##                  95% CI : (0.7938, 0.882)
##     No Information Rate : 0.6338         
##     P-Value [Acc > NIR] : 8.39e-15       
##                                          
##                   Kappa : 0.6674         
##                                          
##  Mcnemar's Test P-Value : 0.07364        
##                                          
##             Sensitivity : 0.8389         
##             Specificity : 0.8462         
##          Pos Pred Value : 0.9042         
##          Neg Pred Value : 0.7521         
##              Prevalence : 0.6338         
##          Detection Rate : 0.5317         
##    Detection Prevalence : 0.5880         
##       Balanced Accuracy : 0.8425         
##                                          
##        'Positive' Class : Ale            
## 

We have a working k-NN model that can predict IPAs and Ales with 84% accuracy when using ABV and IBU as predictors. This is useful in market research in order to determine what kind of ABV and IBUs to use in order to target the most craft consumers.

Other models

In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.

# Naive Bayes model
Beer.nb = naiveBayes(Type~.,data=Beer.train) #set Naive Bayes model with training set
Beer.nb.pred = predict(Beer.nb,Beer.test[,c(4,5)]) #predict beers in test set
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Brewery_id'. Did you use factors
## with numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Beer_ID'. Did you use factors with
## numeric labels for training, and numeric values for new data?
## Warning in predict.naiveBayes(Beer.nb, Beer.test[, c(4, 5)]): Type mismatch
## between training and new data for variable 'Ounces'. Did you use factors with
## numeric labels for training, and numeric values for new data?
confusionMatrix(Beer.nb.pred,as.factor(Beer.test$Type)) #confusion matrix to show statistics on model
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Ale IPA
##        Ale 153  19
##        IPA  27  85
##                                           
##                Accuracy : 0.838           
##                  95% CI : (0.7899, 0.8789)
##     No Information Rate : 0.6338          
##     P-Value [Acc > NIR] : 2.55e-14        
##                                           
##                   Kappa : 0.6566          
##                                           
##  Mcnemar's Test P-Value : 0.302           
##                                           
##             Sensitivity : 0.8500          
##             Specificity : 0.8173          
##          Pos Pred Value : 0.8895          
##          Neg Pred Value : 0.7589          
##              Prevalence : 0.6338          
##          Detection Rate : 0.5387          
##    Detection Prevalence : 0.6056          
##       Balanced Accuracy : 0.8337          
##                                           
##        'Positive' Class : Ale             
## 
# Extra possible results

alesstateplot = ggplot(Beer.ale,mapping=aes(x=State,fill=Type))+geom_bar()+ggtitle("ALE and IPA present in each state")
ggplotly(alesstateplot)
Beer.TypeChart = merge(Beer.afna,Beer.ale,by=c("Brewery_id","Beer_ID", "Name.x", "ABV", "IBU", 
                                               "Style", "Ounces", "Name.y", "City", "State"), all=TRUE)
Beer.TypeChart$Type[is.na(Beer.TypeChart$Type)]="Other"

beerprefstateplot = ggplot(Beer.TypeChart,mapping=aes(x=State,fill=Type))+
  geom_bar(position="fill")+
  ggtitle("Beer Preference by State")+
  ylab("Percentage")+
  scale_y_continuous(labels = scales::percent_format())
ggplotly(beerprefstateplot)